// Package spider implements the crawl master module.
package spider

import (
	"math/rand"
	"net/http"

	"dense_spider/core/common/extract"
	"dense_spider/core/common/filter"
	"dense_spider/core/common/mlog"
	"dense_spider/core/common/page"
	"dense_spider/core/common/page_items"
	"dense_spider/core/common/request"
	"dense_spider/core/common/resource_manage"
	"dense_spider/core/downloader"
	"dense_spider/core/history"
	"dense_spider/core/page_processer"
	"dense_spider/core/pipeline"
	"dense_spider/core/scheduler"
	//"net/http"
	nurl "net/url"
	"time"
	//"fmt"
)

// Spider is the crawl master for one task: it coordinates the page
// processer, downloader, scheduler, history and pipelines.
type Spider struct {
	// taskname labels this crawl task; pipelines may use it to tag results.
	taskname string

	// pPageProcesser extracts data and follow-up links from downloaded pages.
	pPageProcesser page_processer.PageProcesser

	// pDownloader fetches a page for each request.
	pDownloader downloader.Downloader

	// pScheduler queues pending requests (Push / Poll).
	pScheduler scheduler.Scheduler

	// pHistory records urls already seen; Register reports duplicates.
	pHistory history.History

	// pPiplelines receive the PageItems of each non-skipped page.
	pPiplelines []pipeline.Pipeline

	// mc bounds the number of concurrent fetch goroutines in Run.
	mc resource_manage.ResourceManage

	// threadnum is the crawl concurrency; 0 is treated as 1 by Run.
	threadnum uint

	// exitWhenComplete makes Run return once the scheduler drains.
	exitWhenComplete bool

	// Sleeptype can be fixed or rand.
	startSleeptime uint
	endSleeptime   uint
	sleeptype      string
	inhost         bool   // whether to crawl only links under the same host
	maxCrawlDepth  uint   // maximum url depth level to follow
	userAgent      string // User-Agent header sent with every request

	excludeFilter []filter.IFilter // urls matching any of these are skipped
	includeFilter []filter.IFilter // when non-empty, urls must match one of these
	baseUrl       string           // crawl entry url

	extractRules []extract.ExtractRule
}

// SetUserAgent sets the User-Agent header value sent with every request
// and returns the spider for chaining.
// (A leftover debug println of the value was removed.)
func (s *Spider) SetUserAgent(uAgent string) *Spider {
	s.userAgent = uAgent
	return s
}
// GetUserAgent returns the User-Agent header value used for requests.
func (s *Spider) GetUserAgent() string {
	return s.userAgent
}

// GetExtractRules returns the extraction rules configured for this spider.
func (s *Spider) GetExtractRules() []extract.ExtractRule {
	return s.extractRules
}

// SetInHost configures whether only links under the same host as the
// entry url are crawled, and returns the spider for chaining.
func (s *Spider) SetInHost(inhost bool) *Spider {
	s.inhost = inhost
	return s
}

// SetMaxCrawlDepth sets the maximum url depth level to crawl and
// returns the spider for chaining.
func (s *Spider) SetMaxCrawlDepth(mDepth uint) *Spider {
	s.maxCrawlDepth = mDepth
	return s
}

// GetMaxCrawlDepth returns the maximum url depth level to crawl.
func (s *Spider) GetMaxCrawlDepth() uint {
	return s.maxCrawlDepth
}

// AddExcludeFilter registers a filter for urls that must NOT be crawled
// and returns the spider for chaining.
func (s *Spider) AddExcludeFilter(f filter.IFilter) *Spider {
	s.excludeFilter = append(s.excludeFilter, f)
	return s
}

// AddIncludeFilter registers a filter for urls that are allowed to be
// crawled and returns the spider for chaining.
func (s *Spider) AddIncludeFilter(f filter.IFilter) *Spider {
	s.includeFilter = append(s.includeFilter, f)
	return s
}

// IsExternalUrl reports whether url is on the same host as the spider's
// entry url (callers negate the result to detect external links — the
// name is kept for interface compatibility).
//
// Bug fix: the original parsed its parameter twice and compared the
// host to itself, so the check passed for ANY parseable url. It now
// compares against the spider's baseUrl, as the callers intend.
func (s *Spider) IsExternalUrl(url string) bool {
	uri, err := nurl.Parse(url)
	if err != nil {
		return false
	}
	baseUri, err := nurl.Parse(s.baseUrl)
	if err != nil {
		return false
	}
	return baseUri.Host == uri.Host
}
// IsExcludeUrl reports whether url matches any configured exclude filter.
//
// Bug fix: the original returned true when NO exclude filters were
// configured, which made callers (AppendUrl, PageProcess) skip every
// discovered url on a filter-less spider. With no exclude filters,
// nothing is excluded — mirroring IsIncludeUrl's empty-case semantics.
func (s *Spider) IsExcludeUrl(url string) bool {
	for _, f := range s.excludeFilter {
		if f.Match(url) {
			return true
		}
	}
	return false
}
// IsIncludeUrl reports whether url passes the include filters.
// When no include filters are configured, every url is accepted.
func (s *Spider) IsIncludeUrl(url string) bool {
	if len(s.includeFilter) == 0 {
		return true
	}
	for _, f := range s.includeFilter {
		if f.Match(url) {
			return true
		}
	}
	return false
}

// NewSpider wires together the page processer, scheduler, downloader and
// pipelines for one crawl task and seeds the queue with the entry url.
// taskname may be empty; pipelines can use it to tag crawled results.
// Panics if url is non-empty but unparseable.
func NewSpider(pageinst page_processer.PageProcesser, taskname string, url string, extractRules []extract.ExtractRule, userAgent string) *Spider {
	mlog.StraceInst().Open()

	if url != "" {
		if _, err := nurl.Parse(url); err != nil {
			panic("非法Url地址")
		}
	}

	sp := &Spider{
		taskname:       taskname,
		pPageProcesser: pageinst,
		baseUrl:        url,
		extractRules:   extractRules,
	}

	// File logging is off by default.
	sp.CloseFileLog()

	sp.exitWhenComplete = true
	sp.sleeptype = "fixed"
	sp.startSleeptime = 0
	sp.userAgent = userAgent

	// Default modules, installed only if not already present.
	if sp.pScheduler == nil {
		sp.SetScheduler(scheduler.NewQueueScheduler(false))
	}
	if sp.pDownloader == nil {
		sp.SetDownloader(downloader.NewHttpDownloader())
	}

	sp.pPiplelines = make([]pipeline.Pipeline, 0)

	// Seed the scheduler with the entry request.
	sp.AddRequest(request.NewRequest(url, "html", "", "GET", 0, "", nil, nil, nil))

	mlog.StraceInst().Println("** start spider **")

	return sp
}

// Taskname returns the name of this crawl task.
func (s *Spider) Taskname() string {
	return s.taskname
}

// Get crawls a single url synchronously and returns its PageItems.
func (s *Spider) Get(url string, respType string) *page_items.PageItems {
	return s.GetByRequest(request.NewRequest(url, respType, "", "GET", 0, "", nil, nil, nil))
}

// GetAll crawls every url in urls synchronously and returns the
// collected PageItems, one entry per crawled page.
func (s *Spider) GetAll(urls []string, respType string) []*page_items.PageItems {
	reqs := make([]*request.Request, 0, len(urls))
	for _, u := range urls {
		reqs = append(reqs, request.NewRequest(u, respType, "", "GET", 0, "", nil, nil, nil))
	}
	return s.GetAllByRequest(reqs)
}

// GetByRequest crawls one fully-configured request synchronously and
// returns its PageItems, or nil when nothing was collected.
func (s *Spider) GetByRequest(req *request.Request) *page_items.PageItems {
	items := s.GetAllByRequest([]*request.Request{req})
	if len(items) == 0 {
		return nil
	}
	return items[0]
}

// GetAllByRequest queues every request, runs the spider to completion
// through a collecting pipeline, and returns the collected PageItems.
func (s *Spider) GetAllByRequest(reqs []*request.Request) []*page_items.PageItems {
	for _, r := range reqs {
		s.AddRequest(r)
	}

	collector := pipeline.NewCollectPipelinePageItems()
	s.AddPipeline(collector)

	s.Run()

	return collector.GetCollected()
}

// Run drains the scheduler, fetching up to threadnum pages concurrently,
// until the queue is empty and all workers are idle (when
// exitWhenComplete is set). It resets the spider via close() on return.
func (s *Spider) Run() {
	if s.threadnum == 0 {
		s.threadnum = 1
	}
	s.mc = resource_manage.NewResourceManageChan(s.threadnum)

	for {
		req := s.pScheduler.Poll()

		// mc is not atomic
		if s.mc.Has() == 0 && req == nil && s.exitWhenComplete {
			mlog.StraceInst().Println("** end spider **")
			break
		} else if req == nil {
			// Scheduler momentarily empty but workers still running.
			continue
		}
		s.mc.GetOne()

		// Asynchronous fetching.
		// Fix: the original goroutine's parameter was unnamed, so the
		// passed argument was discarded and the closure captured the
		// outer req instead. Naming the parameter makes the hand-off
		// explicit and vet-clean.
		go func(req *request.Request) {
			defer s.mc.FreeOne()
			mlog.StraceInst().Println("start crawl : " + req.GetUrl())
			s.PageProcess(req)
		}(req)
	}
	s.close()
}

// close restores the spider to a fresh default state after a run:
// new scheduler, new downloader, empty pipelines, exit-on-complete.
func (s *Spider) close() {
	s.SetScheduler(scheduler.NewQueueScheduler(false))
	s.SetDownloader(downloader.NewHttpDownloader())
	s.pPiplelines = make([]pipeline.Pipeline, 0)
	s.exitWhenComplete = true
}

// AddPipeline appends an output pipeline and returns the spider for chaining.
func (s *Spider) AddPipeline(p pipeline.Pipeline) *Spider {
	s.pPiplelines = append(s.pPiplelines, p)
	return s
}
// SetHistory installs the crawl-history deduplicator and returns the
// spider for chaining.
func (s *Spider) SetHistory(h history.History) *Spider {
	s.pHistory = h
	return s
}
// SetScheduler installs the request scheduler and returns the spider
// for chaining.
func (sp *Spider) SetScheduler(s scheduler.Scheduler) *Spider {
	sp.pScheduler = s
	return sp
}

// GetScheduler returns the installed request scheduler.
func (s *Spider) GetScheduler() scheduler.Scheduler {
	return s.pScheduler
}

// SetDownloader installs the page downloader and returns the spider
// for chaining.
func (s *Spider) SetDownloader(d downloader.Downloader) *Spider {
	s.pDownloader = d
	return s
}

// GetDownloader returns the installed page downloader.
func (s *Spider) GetDownloader() downloader.Downloader {
	return s.pDownloader
}

// SetThreadnum sets the crawl concurrency and returns the spider for
// chaining. A value of 0 is treated as 1 by Run.
func (s *Spider) SetThreadnum(n uint) *Spider {
	s.threadnum = n
	return s
}

// GetThreadnum returns the configured crawl concurrency.
func (s *Spider) GetThreadnum() uint {
	return s.threadnum
}

// SetExitWhenComplete controls whether Run returns once the queue
// drains. Set it to false to keep the spider alive in memory and feed
// it urls from outside. Returns the spider for chaining.
func (s *Spider) SetExitWhenComplete(e bool) *Spider {
	s.exitWhenComplete = e
	return s
}

// GetExitWhenComplete reports whether Run exits when the queue drains.
func (s *Spider) GetExitWhenComplete() bool {
	return s.exitWhenComplete
}

// OpenFileLog enables file logging at filePath. Once open, errors and
// other info are written via mlog.LogInst().LogError("info") or
// mlog.LogInst().LogInfo("info"). The spider's file log is closed by
// default. Returns the spider for chaining.
func (s *Spider) OpenFileLog(filePath string) *Spider {
	mlog.InitFilelog(true, filePath)
	return s
}

// OpenFileLogDefault enables file logging at the default path
// (e.g. "WD/log/log.2014-9-1") and returns the spider for chaining.
func (s *Spider) OpenFileLogDefault() *Spider {
	mlog.InitFilelog(true, "")
	return s
}

// CloseFileLog disables file logging and returns the spider for chaining.
func (s *Spider) CloseFileLog() *Spider {
	mlog.InitFilelog(false, "")
	return s
}

// OpenStrace enables on-screen progress output (opened by default)
// and returns the spider for chaining.
func (s *Spider) OpenStrace() *Spider {
	mlog.StraceInst().Open()
	return s
}

// CloseStrace disables on-screen progress output and returns the
// spider for chaining.
func (s *Spider) CloseStrace() *Spider {
	mlog.StraceInst().Close()
	return s
}

// SetSleepTime configures the pause after each crawl task, in
// milliseconds. With sleeptype "fixed", s is the sleep time and e is
// ignored. With sleeptype "rand", the sleep time is random in [s, e).
// Panics if sleeptype is "rand" and s >= e.
//
// Fixes vs. original: validate BEFORE mutating state (so a recovered
// panic leaves the spider unchanged) and correct the panic message
// grammar ("must smaller" -> "must be smaller").
func (sp *Spider) SetSleepTime(sleeptype string, s uint, e uint) *Spider {
	if sleeptype == "rand" && s >= e {
		panic("startSleeptime must be smaller than endSleeptime")
	}
	sp.sleeptype = sleeptype
	sp.startSleeptime = s
	sp.endSleeptime = e
	return sp
}

// sleep pauses between crawl tasks: a fixed delay of startSleeptime ms,
// or a random delay in [startSleeptime, endSleeptime) ms when the
// sleep type is "rand". Any other sleep type sleeps not at all.
func (s *Spider) sleep() {
	switch s.sleeptype {
	case "fixed":
		time.Sleep(time.Duration(s.startSleeptime) * time.Millisecond)
	case "rand":
		span := int(s.endSleeptime - s.startSleeptime)
		delay := rand.Intn(span) + int(s.startSleeptime)
		time.Sleep(time.Duration(delay) * time.Millisecond)
	}
}

// AppendUrl queues a discovered url for crawling if it passes every
// admission check, in order: crawl depth, same-host check,
// exclude/include filters, and the crawl-history deduplicator.
// Returns the spider for chaining.
func (s *Spider) AppendUrl(url string, reqType string, depth uint) *Spider {
	if depth > s.maxCrawlDepth {
		return s
	}
	if !s.IsExternalUrl(url) || s.IsExcludeUrl(url) || !s.IsIncludeUrl(url) {
		return s
	}
	if s.pHistory != nil && s.pHistory.Register(url) {
		return s
	}

	s.AddRequest(request.NewRequest(url, reqType, "", "GET", depth, "", nil, nil, nil))
	return s
}

// AddRequest stamps the spider's User-Agent header onto req and pushes
// it onto the scheduler. Nil or url-less requests are logged and
// dropped. Returns the spider for chaining.
//
// Bug fix: the original called req.SetHeader BEFORE the req == nil
// check, so the nil guard could never trigger — a nil request panicked
// first. Validation now runs before req is touched.
func (s *Spider) AddRequest(req *request.Request) *Spider {
	if req == nil {
		mlog.LogInst().LogError("request is nil")
		return s
	} else if req.GetUrl() == "" {
		mlog.LogInst().LogError("request is empty")
		return s
	}

	header := make(http.Header)
	header.Set("User-Agent", s.userAgent)
	req.SetHeader(header)

	s.pScheduler.Push(req)

	return s
}

// AddRequests queues each request in reqs via AddRequest and returns
// the spider for chaining.
func (s *Spider) AddRequests(reqs []*request.Request) *Spider {
	for _, r := range reqs {
		s.AddRequest(r)
	}
	return s
}

// PageProcess is the core worker: it downloads one request (with a
// single retry after a pause), runs the page processer, queues the
// page's qualifying follow-up links, and feeds the result through the
// pipelines unless the page is flagged to be skipped.
func (s *Spider) PageProcess(req *request.Request) {
	p := s.pDownloader.Download(req)
	if !p.IsSucc() {
		// One retry after the configured inter-crawl pause.
		s.sleep()
		p = s.pDownloader.Download(req)
	}

	s.pPageProcesser.Process(p, s.extractRules, s.maxCrawlDepth)

	// Queue follow-up links that pass depth, host, filter and
	// history checks.
	for _, target := range p.GetTargetRequests() {
		if target.GetDepth() > s.maxCrawlDepth {
			continue
		}
		u := target.GetUrl()
		if !s.IsExternalUrl(u) || s.IsExcludeUrl(u) || !s.IsIncludeUrl(u) {
			continue
		}
		if s.pHistory != nil && s.pHistory.Register(u) {
			continue
		}
		s.AddRequest(target)
	}

	// Hand the extracted items to every pipeline.
	if !p.GetSkip() {
		for _, pip := range s.pPiplelines {
			pip.Process(p.GetPageItems(), s)
		}
	}

	s.sleep()
}
